This report explores possible visualization solution for logistic regression modeling.

(I) Exposition

This report is a record of interaction with a data transfer object (dto) produced by ./manipulation/0-ellis-island.R.

The next section recaps this script, exposes the architecture of the DTO, and demonstrates the language of interacting with it.

(I.A) Ellis Island

All data land on Ellis Island.

The script 0-ellis-island.R is the first script in the analytic workflow. It accomplished the following:

    1. Reads in raw data files from the candidate studies
    1. Extract, combines, and exports their metadata (specifically, variable names and labels, if provided) into ./data/shared/derived/meta-data-live.csv, which is updated every time Ellis Island script is executed.
    1. Augments raw metadata with instructions for renaming and classifying variables. The instructions are provided as manually entered values in ./data/shared/meta-data-map.csv. They are used by automatic scripts in later harmonization and analysis.
    1. Combines unit and metadata into a single DTO to serve as a starting point to all subsequent analyses.
# load the product of 0-ellis-island.R,  a list object containing data and metadata
dto <- readRDS("./data/unshared/derived/dto.rds")
# the list is composed of the following elements
names(dto)
[1] "studyName" "filePath"  "unitData"  "metaData" 
# 1st element - names of the studies as character vector
dto[["studyName"]]
[1] "alsa"  "lbsl"  "satsa" "share" "tilda"
# 2nd element - file paths of the data files for each study as character vector
dto[["filePath"]]
[1] "./data/unshared/raw/ALSA-Wave1.Final.sav"         "./data/unshared/raw/LBSL-Panel2-Wave1.Final.sav" 
[3] "./data/unshared/raw/SATSA-Q3.Final.sav"           "./data/unshared/raw/SHARE-Israel-Wave1.Final.sav"
[5] "./data/unshared/raw/TILDA-Wave1.Final.sav"       
# 3rd element - is a list object containing the following elements
names(dto[["unitData"]])
[1] "alsa"  "lbsl"  "satsa" "share" "tilda"
# each of these elements is a raw data set of a corresponding study, for example
dplyr::tbl_df(dto[["unitData"]][["lbsl"]]) 
Source: local data frame [656 x 38]

        id AGE94 SEX94  MSTAT94 EDUC94     NOWRK94  SMK94                                         SMOKE
     (int) (int) (int)   (fctr)  (int)      (fctr) (fctr)                                        (fctr)
1  4001026    68     1 divorced     16 no, retired     no                                  never smoked
2  4012015    94     2  widowed     12 no, retired     no                                  never smoked
3  4012032    94     2  widowed     20 no, retired     no don't smoke at present but smoked in the past
4  4022004    93     2       NA     NA          NA     NA                                  never smoked
5  4022026    93     2  widowed     12 no, retired     no                                  never smoked
6  4031031    92     1  married      8 no, retired     no don't smoke at present but smoked in the past
7  4031035    92     1  widowed     13 no, retired     no don't smoke at present but smoked in the past
8  4032201    92     2       NA     NA          NA     NA don't smoke at present but smoked in the past
9  4041062    91     1  widowed      7          NA     no don't smoke at present but smoked in the past
10 4042057    91     2       NA     NA          NA     NA                                            NA
..     ...   ...   ...      ...    ...         ...    ...                                           ...
Variables not shown: ALCOHOL (fctr), WINE (int), BEER (int), HARDLIQ (int), SPORT94 (int), FIT94 (int), WALK94 (int),
  SPEC94 (int), DANCE94 (int), CHORE94 (int), EXCERTOT (int), EXCERWK (int), HEIGHT94 (int), WEIGHT94 (int), HWEIGHT
  (int), HHEIGHT (int), SRHEALTH (fctr), smoke_now (lgl), smoked_ever (lgl), year_of_wave (dbl), age_in_years (dbl),
  year_born (dbl), female (lgl), marital (chr), educ3 (chr), current_work_2 (lgl), current_drink (lgl), sedentary
  (lgl), poor_health (lgl), bmi (dbl)

Meta

# 4th element - a dataset names and labels of raw variables + added metadata for all studies
dto[["metaData"]] %>% dplyr::select(study_name, name, item, construct, type, categories, label_short, label) %>%
  DT::datatable(
    class   = 'cell-border stripe',
    caption = "This is the primary metadata file. Edit at `./data/shared/meta-data-map.csv",
    filter  = "top",
    options = list(pageLength = 6, autoWidth = TRUE)
  )

Combined data

dmls <- list() # dummy list
for(s in dto[["studyName"]]){
  ds <- dto[["unitData"]][[s]] # get study data from dto
  (varnames <- names(ds)) # see what variables there are
  (get_these_variables <- c("id",
                            "year_of_wave","age_in_years","year_born",
                            "female",
                            "marital",
                            "educ3",
                            "smoke_now","smoked_ever",
                            "current_work_2",
                            "current_drink",
                            "sedentary",
                            "poor_health",
                            "bmi")) 
  (variables_present <- varnames %in% get_these_variables) # variables on the list
  dmls[[s]] <- ds[,variables_present] # keep only them
}
lapply(dmls, names) # view the contents of the list object
$alsa
 [1] "id"             "smoke_now"      "smoked_ever"    "year_of_wave"   "age_in_years"   "year_born"     
 [7] "female"         "marital"        "educ3"          "current_work_2" "current_drink"  "sedentary"     
[13] "poor_health"    "bmi"           

$lbsl
 [1] "id"             "smoke_now"      "smoked_ever"    "year_of_wave"   "age_in_years"   "year_born"     
 [7] "female"         "marital"        "educ3"          "current_work_2" "current_drink"  "sedentary"     
[13] "poor_health"    "bmi"           

$satsa
 [1] "id"             "smoke_now"      "smoked_ever"    "year_of_wave"   "age_in_years"   "year_born"     
 [7] "female"         "marital"        "educ3"          "current_work_2" "current_drink"  "sedentary"     
[13] "poor_health"    "bmi"           

$share
 [1] "id"             "smoke_now"      "smoked_ever"    "year_of_wave"   "year_born"      "age_in_years"  
 [7] "female"         "marital"        "educ3"          "current_work_2" "current_drink"  "sedentary"     
[13] "poor_health"    "bmi"           

$tilda
 [1] "id"             "smoke_now"      "smoked_ever"    "year_of_wave"   "age_in_years"   "year_born"     
 [7] "female"         "marital"        "educ3"          "current_work_2" "current_drink"  "sedentary"     
[13] "poor_health"    "bmi"           
ds <- plyr::ldply(dmls,data.frame,.id = "study_name")
ds$id <- 1:nrow(ds) # some ids values might be identical, replace
head(ds)
  study_name id smoke_now smoked_ever year_of_wave age_in_years year_born female   marital                 educ3
1       alsa  1     FALSE       FALSE         1992           86      1906  FALSE mar_cohab more than high school
2       alsa  2     FALSE       FALSE         1992           78      1914   TRUE mar_cohab           high school
3       alsa  3     FALSE       FALSE         1992           89      1903   TRUE   widowed           high school
4       alsa  4     FALSE       FALSE         1992           78      1914  FALSE   widowed           high school
5       alsa  5     FALSE       FALSE         1992           85      1907  FALSE   widowed more than high school
6       alsa  6     FALSE       FALSE         1992           92      1900   TRUE   widowed           high school
  current_work_2 current_drink sedentary poor_health bmi
1          FALSE          TRUE     FALSE       FALSE  NA
2          FALSE          TRUE     FALSE       FALSE  NA
3          FALSE          TRUE      TRUE       FALSE  NA
4           TRUE          TRUE     FALSE        TRUE  NA
5          FALSE          TRUE     FALSE       FALSE  NA
6          FALSE          TRUE      TRUE       FALSE  NA

Basic info

# age summary across studies
ds %>%  
  dplyr::group_by(study_name) %>%
  na.omit() %>% 
  dplyr::summarize(mean_age = round(mean(age_in_years),1),
                   sd_age = round(sd(age_in_years),2),
                   observed = n(),
                   min_born = min(year_born),
                   med_born = median(year_born),
                   max_born = max(year_born)) 
Source: local data frame [4 x 7]

  study_name mean_age sd_age observed min_born med_born max_born
      (fctr)    (dbl)  (dbl)    (int)    (dbl)    (dbl)    (dbl)
1       lbsl     68.0  13.11      516     1900     1925     1964
2      satsa     63.1  12.67     1283     1900     1925     1998
3      share     62.0   9.97     2481     1911     1944     1967
4      tilda     62.6   9.12     4531     1929     1947     1960
# see counts across age groups and studies 
t <- table(
  cut(ds$age_in_years,breaks = c(-Inf,seq(from=40,to=100,by=5), Inf)),
  ds$study_name, 
  useNA = "always"
); t[t==0] <- "."; t
            
             alsa lbsl satsa share tilda <NA>
  (-Inf,40]  .    25   73    8     .     .   
  (40,45]    .    25   69    39    .     .   
  (45,50]    .    30   114   221   663   .   
  (50,55]    .    45   162   526   1637  .   
  (55,60]    .    28   126   489   1590  .   
  (60,65]    13   87   168   361   1388  .   
  (65,70]    258  101  222   389   1138  .   
  (70,75]    552  81   235   263   884   .   
  (75,80]    513  67   198   162   1192  .   
  (80,85]    425  110  96    98    .     .   
  (85,90]    254  43   28    33    .     .   
  (90,95]    58   13   4     6     .     .   
  (95,100]   12   1    1     .     .     .   
  (100, Inf] 2    .    .     .     .     .   
  <NA>       .    .    1     3     12    .   
# basic counts
table(ds$study_name, ds$smoke_now, useNA = "always")
       
        FALSE TRUE <NA>
  alsa   1851  217   19
  lbsl    480   71  105
  satsa  1067  365   65
  share  2186  408    4
  tilda  6939 1564    1
  <NA>      0    0    0
table(ds$study_name, ds$smoked_ever, useNA = "always")
       
        FALSE TRUE <NA>
  alsa   1851  217   19
  lbsl    207  351   98
  satsa   700  696  101
  share  1542 1052    4
  tilda  3726 4777    1
  <NA>      0    0    0
table(ds$study_name, ds$female, useNA = "always")
       
        FALSE TRUE <NA>
  alsa   1056 1031    0
  lbsl    314  342    0
  satsa   610  887    0
  share  1139 1459    0
  tilda  3780 4724    0
  <NA>      0    0    0
table(ds$study_name, ds$marital, useNA = "always")
       
        mar_cohab sep_divorced single widowed <NA>
  alsa       1367           49     76     594    1
  lbsl        326           77     22     134   97
  satsa       961          113    149     259   15
  share      2049          159     51     336    3
  tilda      5966          552    791    1195    0
  <NA>          0            0      0       0    0
table(ds$study_name, ds$educ3, useNA = "always")
       
        high school less than high school more than high school <NA>
  alsa          819                   337                   905   26
  lbsl          170                    74                   310  102
  satsa         121                  1239                   109   28
  share         889                   965                   717   27
  tilda        2795                  5222                   483    4
  <NA>            0                     0                     0    0

Study as factor

d <- ds %>% 
  dplyr::select_("id", "study_name", "smoke_now", 
                 "age_in_years", "female", "marital", "educ3","poor_health") %>% 
  na.omit()
mdl <- glm(
  formula = smoke_now ~ -1 + study_name + age_in_years + female + marital + educ3 + poor_health,
  data = d, family = "binomial"
);summary(mdl)

Call:
glm(formula = smoke_now ~ -1 + study_name + age_in_years + female + 
    marital + educ3 + poor_health, family = "binomial", data = d)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4099  -0.6540  -0.5434  -0.4232   2.5534  

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
study_namealsa              0.99389    0.20694   4.803 1.57e-06 ***
study_namelbsl              0.71123    0.21340   3.333  0.00086 ***
study_namesatsa             1.13828    0.16995   6.698 2.11e-11 ***
study_nameshare             0.70152    0.16513   4.248 2.15e-05 ***
study_nametilda             0.85954    0.15958   5.386 7.20e-08 ***
age_in_years               -0.04168    0.00255 -16.342  < 2e-16 ***
femaleTRUE                 -0.17909    0.04534  -3.950 7.81e-05 ***
maritalsep_divorced         0.65913    0.07874   8.371  < 2e-16 ***
maritalsingle               0.24404    0.08254   2.956  0.00311 ** 
maritalwidowed              0.37548    0.06988   5.374 7.72e-08 ***
educ3less than high school  0.21996    0.05219   4.215 2.50e-05 ***
educ3more than high school -0.30567    0.07903  -3.868  0.00011 ***
poor_healthTRUE             0.35032    0.04857   7.213 5.49e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 20836  on 15030  degrees of freedom
Residual deviance: 13256  on 15017  degrees of freedom
AIC: 13282

Number of Fisher Scoring iterations: 4
d$smoke_now_p <- predict(mdl)
graph_logistic_point_complex_4(
  ds = d, 
  x_name = "age_in_years", 
  y_name = "smoke_now_p", 
  covar_order = c("female","marital","educ3","poor_health"),
  alpha_level = .3) 

graph_logitstic_curve_complex_4(
  ds = d, 
  x_name = "age_in_years", 
  y_name = "smoke_now", 
  covar_order = c("female","marital","educ3","poor_health"),
  alpha_level = .3) 

Study as cluster

model_outcome <- "smoke_now"
model_predictors <- c("age_in_years", "female", "marital", "educ3","poor_health")

ml <- list() # create a model list object to contain model estimation and modeled data
for(s in dto[["studyName"]]){
  d <- dto[['unitData']][[s]] %>% 
    dplyr::select_(.dots=c("id",model_outcome, model_predictors)) %>% 
    na.omit()
  mdl <- stats::glm( # fit model
    formula = smoke_now ~ age_in_years +female + marital + educ3 + poor_health ,
    data = d, family="binomial"
  ); summary(mdl); 
  modeled_response_name <- paste0(model_outcome,"_p")
  d[,modeled_response_name] <- predict(mdl)
  ml[["data"]][[s]] <- d
  ml[["model"]][[s]] <- mdl
}
# names(ml[["data"]])
# names(ml[["model"]])

d <- plyr::ldply(ml[["data"]],data.frame,.id = "study_name")
d$id <- 1:nrow(d) # some ids values might be identical, replace
head(d)
  study_name id smoke_now age_in_years female   marital                 educ3 poor_health smoke_now_p
1       alsa  1     FALSE           86  FALSE mar_cohab more than high school       FALSE   -2.484083
2       alsa  2     FALSE           78   TRUE mar_cohab           high school       FALSE   -2.685699
3       alsa  3     FALSE           89   TRUE   widowed           high school       FALSE   -2.988931
4       alsa  4     FALSE           78  FALSE   widowed           high school        TRUE   -1.711330
5       alsa  5     FALSE           85  FALSE   widowed more than high school       FALSE   -2.130293
6       alsa  6     FALSE           92   TRUE   widowed           high school       FALSE   -3.153186
graph_logistic_point_complex_4(
  ds = d, 
  x_name = "age_in_years", 
  y_name = "smoke_now_p", 
  covar_order = c("female","marital","educ3","poor_health"),
  alpha_level = .3) 

graph_logitstic_curve_complex_4(
  ds = d, 
  x_name = "age_in_years", 
  y_name = "smoke_now", 
  covar_order = c("female","marital","educ3","poor_health"),
  alpha_level = .3)